{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from cot import Collection\n",
    "from cot.generate import FRAGMENTS\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "from sprint_utils import get_answer_extraction, join_strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"commonsense_qa\", \"validation\"\n",
    "# \"med_qa\", \"test\"\n",
    "# \"medmc_qa\", \"validation\"\n",
    "# \"open_book_qa\", \"test\"\n",
    "# \"strategy_qa\", \"train\"\n",
    "# \"worldtree\", \"test\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# None,         # no cot-trigger\n",
    "# 'kojima-01',  # Answer: Let's think step by step.\n",
    "# 'kojima-02',  # Answer: First,\n",
    "# 'kojima-03',  # Answer: Let's think about this logically."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_sprint_thoughtsource(collection, dataset_name_splits, api_service_model, cot_trigger_key=None):\n",
    "    # collection = create_thoughtsource_100()\n",
    "    collection = collection\n",
    "    for dataset_name, split in dataset_name_splits:\n",
    "        print(dataset_name)\n",
    "        answer_extraction_key = get_answer_extraction(collection[dataset_name][split][0][\"type\"], collection[dataset_name][split][0][\"choices\"])\n",
    "        for api_service, model, api_time_interval in api_service_model:\n",
    "\n",
    "            config={\n",
    "                \"instruction_keys\": None,\n",
    "                \"cot_trigger_keys\": cot_trigger_key,\n",
    "                \"answer_extraction_keys\": [answer_extraction_key], # Therefore, among A through C/D/E/F, the answer is'\n",
    "                \"author\" : \"thoughtsource\",\n",
    "                \"api_service\": api_service,\n",
    "                \"engine\": model,\n",
    "                \"temperature\": 0,\n",
    "                \"max_tokens\": 512,\n",
    "                \"api_time_interval\": api_time_interval,\n",
    "                \"verbose\": False,\n",
    "                \"warn\": False,\n",
    "            }\n",
    "        \n",
    "            collection.generate(name=dataset_name, split=split,config=config)\n",
    "            collection.dump(\"extra_save_thoughtsource_100_\" + dataset_name + \"_huggingface_endpoint.json\")\n",
    "    collection.evaluate()\n",
    "    # write model name manually in here, or it gives an error because of the \"/\" in the model name = endpoint url in this case\n",
    "    collection.dump(\"thoughtsource_100\" + \"_\" + api_service + \"_\" + \"google_flan-t5-xxl\" + join_strings(config[\"cot_trigger_keys\"]) + \".json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ENDPOINT TEST\n",
    "endpoint_url = (\n",
    "    \"https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name_splits = [\n",
    "    # (\"commonsense_qa\", \"validation\"),\n",
    "    # (\"med_qa\", \"test\"),\n",
    "    # (\"medmc_qa\", \"validation\"),\n",
    "    (\"open_book_qa\", \"test\"),\n",
    "    (\"strategy_qa\", \"train\"),\n",
    "    (\"worldtree\", \"test\"),\n",
    "]\n",
    "\n",
    "# api_service_model = [(\"huggingface_hub\", \"google/flan-t5-xl\", 1)]\n",
    "api_service_model = [(\"huggingface_endpoint\", endpoint_url, 0)]\n",
    "# api_service_model = [(\"mock_api\", \"mock_model\", 0)]\n",
    "cot_trigger_key = [None, \"kojima-01\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from cot.create import create_thoughtsource_1\n",
    "# coll = create_thoughtsource_1()\n",
    "# coll.dump(\"thoughtsource_data/thoughtsource_1_empty.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/kon/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# collection = Collection.from_json(\"thoughtsource_data/thoughtsource_1_empty.json\")\n",
    "collection = Collection.from_json(\"thoughtsource_data/thoughtsource_100_empty.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "open_book_qa\n",
      "Generating open_book_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a591e3e299074fed881e0617a6b518cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "API-Error in item 36: Error raised by inference endpoint: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))\n",
      "Retrying with additional time of 10 seconds.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f16806880cdd4a22ad5f39a15573c0f6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "885994a54471416a81df6b1194ea1b47",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "82f41c5906534fedb999e5364b8664eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f0b1a6d3ba0741b18d98ea177fdb156f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e13019f294084cd397c8516028105131",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b033cd91b74146498143a474c4c5e747",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "strategy_qa\n",
      "Generating strategy_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "468bd5d672b14c88afb557624c4fa528",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5815e68c756e493188a35b2695c7019a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e9eb6f9421f047c8a81ce4f98387f6e0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "89a7f6bc30174531a4da623b612ad1dc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e0ee1eafa4c2454685419952f0497db1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e165f1a6d4cf4b3cb6132274d438c1fc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e4a1be3f46bf43daa5f56ee5a9982b46",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "worldtree\n",
      "Generating worldtree...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e47bca1d87ea4feb9203f207b2ef1140",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "API-Error in item 58: Error raised by inference API: Bad Gateway\n",
      "Retrying with additional time of 10 seconds.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c617365012ed42c2afa4aab6bd48cc02",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e1aa52961d91425191858b27c8d22fa4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bffdd70dbe1a4a8cbe065eff7d9804d4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "478ecebc611d4870989c0408558abfad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e5e7671d09cc44f1a946b9ff8340009a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "68acfb5eda8a44c481a795afbee53315",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating commonsense_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cc62135fefe24c0db1b8c52f93b681c5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating med_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b8d6a8a9c23e4e838138527b56acae38",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating medmc_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e2ccd16ba0b949948fc36c576f557b8b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating open_book_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1b02caccf1354d369030cd7a34597d53",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating strategy_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d049a05f75404b15bcbc2f3fe75aa542",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating worldtree...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e9ca0c031dc4282869dd755c65de08b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c0a01ba6e9c343e39d08090a1c059ec1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c0e45c110617401e898935e17d3f3729",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "595272945d4243888737db477e981f56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5544e809f10943279a478a77136f11b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9dcff1f2dc60499b8bd5654786e14b24",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "06e0beb7c58e40ccaa474ef987d37d4c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "model_sprint_thoughtsource(collection, dataset_name_splits, api_service_model, cot_trigger_key)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Error raised by inference endpoint: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating commonsense_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "255c1d314b4a4df5bfead8439277de47",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating med_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f535b94fbf6e4ca5995beabafc861484",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating medmc_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "da7c8dd4ad8c4b96af0ae0c27371dc3c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating open_book_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b02d489cd28346e1bea73d0e0656e674",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating strategy_qa...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5c2412b1fa3a418c81ddba8cc3f02b7f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating worldtree...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1f554af2af02487584c3a8098fdc4fa9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/100 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'commonsense_qa'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'validation'</span>: <span style=\"font-weight: bold\">{}}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'med_qa'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'test'</span>: <span style=\"font-weight: bold\">{}}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'medmc_qa'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'validation'</span>: <span style=\"font-weight: bold\">{}}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'open_book_qa'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'test'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'accuracy'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_None_kojima-A-D'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.82</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_kojima-01_kojima-A-D'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.79</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"font-weight: bold\">}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'strategy_qa'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'train'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'accuracy'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_None_kojima-yes-no'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.61</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_kojima-01_kojima-yes-no'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.69</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"font-weight: bold\">}</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"color: #008000; text-decoration-color: #008000\">'worldtree'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'test'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'accuracy'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'</span>: <span style=\"font-weight: bold\">{</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_None_kojima-A-D'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.851485</span>,\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   │   </span><span style=\"color: #008000; text-decoration-color: #008000\">'None_kojima-01_kojima-A-D'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.8</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   │   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│   </span><span style=\"font-weight: bold\">}</span>\n",
       "<span style=\"font-weight: bold\">}</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'commonsense_qa'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'validation'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'med_qa'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'test'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'medmc_qa'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'validation'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'open_book_qa'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[32m'test'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[32m'accuracy'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[32m'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_None_kojima-A-D'\u001b[0m: \u001b[1;36m0.82\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_kojima-01_kojima-A-D'\u001b[0m: \u001b[1;36m0.79\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   \u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'strategy_qa'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[32m'train'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[32m'accuracy'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[32m'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_None_kojima-yes-no'\u001b[0m: \u001b[1;36m0.61\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_kojima-01_kojima-yes-no'\u001b[0m: \u001b[1;36m0.69\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   \u001b[0m\u001b[1m}\u001b[0m,\n",
       "\u001b[2;32m│   \u001b[0m\u001b[32m'worldtree'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[32m'test'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[32m'accuracy'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[32m'https://fcyrfns6if4vl115.us-east-1.aws.endpoints.huggingface.cloud'\u001b[0m: \u001b[1m{\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_None_kojima-A-D'\u001b[0m: \u001b[1;36m0.851485\u001b[0m,\n",
       "\u001b[2;32m│   │   │   │   │   \u001b[0m\u001b[32m'None_kojima-01_kojima-A-D'\u001b[0m: \u001b[1;36m0.8\u001b[0m\n",
       "\u001b[2;32m│   │   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   │   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[2;32m│   \u001b[0m\u001b[1m}\u001b[0m\n",
       "\u001b[1m}\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "eval = collection.evaluate()\n",
    "pprint(eval)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ccbfa654f25866afe66a1c016d0b518a994fbe20a93c2d8b432dbe114020e2d2"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
